\[\\[0.01in]\]
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.0
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Read the batting table from its CSV export and preview the first rows.
# NOTE(review): the path is an absolute, machine-specific location; the script
# only runs unchanged on a machine that has this exact directory layout.
batting <- read.csv(
  file = "/Users/aditewari/Desktop/R-Course/R-for-Data-Science-and-Machine-Learning/Training Exercises/Capstone and Data Viz Projects/Capstone Project/Batting.csv"
)
head(x = batting)
## playerID yearID stint teamID lgID G G_batting AB R H X2B X3B HR RBI SB CS
## 1 aardsda01 2004 1 SFN NL 11 11 0 0 0 0 0 0 0 0 0
## 2 aardsda01 2006 1 CHN NL 45 43 2 0 0 0 0 0 0 0 0
## 3 aardsda01 2007 1 CHA AL 25 2 0 0 0 0 0 0 0 0 0
## 4 aardsda01 2008 1 BOS AL 47 5 1 0 0 0 0 0 0 0 0
## 5 aardsda01 2009 1 SEA AL 73 3 0 0 0 0 0 0 0 0 0
## 6 aardsda01 2010 1 SEA AL 53 4 0 0 0 0 0 0 0 0 0
## BB SO IBB HBP SH SF GIDP G_old
## 1 0 0 0 0 0 0 0 11
## 2 0 0 0 0 1 0 0 45
## 3 0 0 0 0 0 0 0 2
## 4 0 1 0 0 0 0 0 5
## 5 0 0 0 0 0 0 0 NA
## 6 0 0 0 0 0 0 0 NA
# Print the structure of the batting table (column names, types, sample values).
str(object = batting)
## 'data.frame': 97889 obs. of 24 variables:
## $ playerID : chr "aardsda01" "aardsda01" "aardsda01" "aardsda01" ...
## $ yearID : int 2004 2006 2007 2008 2009 2010 2012 1954 1955 1956 ...
## $ stint : int 1 1 1 1 1 1 1 1 1 1 ...
## $ teamID : chr "SFN" "CHN" "CHA" "BOS" ...
## $ lgID : chr "NL" "NL" "AL" "AL" ...
## $ G : int 11 45 25 47 73 53 1 122 153 153 ...
## $ G_batting: int 11 43 2 5 3 4 NA 122 153 153 ...
## $ AB : int 0 2 0 1 0 0 NA 468 602 609 ...
## $ R : int 0 0 0 0 0 0 NA 58 105 106 ...
## $ H : int 0 0 0 0 0 0 NA 131 189 200 ...
## $ X2B : int 0 0 0 0 0 0 NA 27 37 34 ...
## $ X3B : int 0 0 0 0 0 0 NA 6 9 14 ...
## $ HR : int 0 0 0 0 0 0 NA 13 27 26 ...
## $ RBI : int 0 0 0 0 0 0 NA 69 106 92 ...
## $ SB : int 0 0 0 0 0 0 NA 2 3 2 ...
## $ CS : int 0 0 0 0 0 0 NA 2 1 4 ...
## $ BB : int 0 0 0 0 0 0 NA 28 49 37 ...
## $ SO : int 0 0 0 1 0 0 NA 39 61 54 ...
## $ IBB : int 0 0 0 0 0 0 NA NA 5 6 ...
## $ HBP : int 0 0 0 0 0 0 NA 3 3 2 ...
## $ SH : int 0 1 0 0 0 0 NA 6 7 5 ...
## $ SF : int 0 0 0 0 0 0 NA 4 4 7 ...
## $ GIDP : int 0 0 0 0 0 0 NA 13 20 21 ...
## $ G_old : int 11 45 2 5 NA NA NA 122 153 153 ...
# Preview the first values of the AB column.
head(batting[["AB"]])
## [1] 0 2 0 1 0 0
# Preview the first values of the X2B column.
head(batting[["X2B"]])
## [1] 0 0 0 0 0 0
\[\\[0.01in]\]
# Add the BA column as H divided by AB, evaluated row by row.
# Rows where AB is 0 or NA come out as NaN/NA rather than a number.
batting[["BA"]] <- with(batting, H / AB)
# Inspect the last few computed values.
tail(batting[["BA"]])
## [1] 0.0000000 0.1230769 0.2746479 0.1470588 0.2745098 0.2138728
\[\\[0.01in]\]
# Add the OBP column: (H + BB + HBP) / (AB + BB + HBP + SF).
# The columns are resolved with with() instead of attach(): attach() places a
# snapshot copy of the data frame on the search path, so later changes to
# `batting` are not visible through it and its column names can mask other
# objects. with() evaluates the expression against the current `batting` only.
batting$OBP <- with(batting, (H + BB + HBP) / (AB + BB + HBP + SF))
# Inspect the last few computed values.
tail(batting$OBP)
## [1] 0.0000000 0.1343284 0.3443918 0.1470588 0.3543759 0.2901554
\[\\[0.01in]\]
# Add the X1B column: H minus X2B, X3B and HR.
# with() reads the columns straight from the current `batting`, so no
# attach() (and no stale copy on the search path) is needed.
batting$X1B <- with(batting, H - X2B - X3B - HR)
# NOTE(review): this prints AB, not the freshly computed X1B column; confirm
# whether tail(batting$X1B) was intended.
tail(batting$AB)
## [1] 2 65 568 34 612 173
# Add the SLG column: (X1B + 2*X2B + 3*X3B + 4*HR) / AB.
# with() evaluates against the current `batting`, which already holds X1B,
# so re-attaching the data frame just to expose the new column is unnecessary.
# Rows where AB is 0 or NA come out as NaN/NA.
batting$SLG <- with(batting, (X1B + X2B * 2 + X3B * 3 + HR * 4) / AB)
# Inspect the last few computed values.
tail(batting$SLG)
## [1] 0.0000000 0.1384615 0.4647887 0.1470588 0.4019608 0.3294798
# Read the salary table from its CSV export and preview the first rows.
# NOTE(review): the path is an absolute, machine-specific location.
sal <- read.csv(
  file = "/Users/aditewari/Desktop/R-Course/R-for-Data-Science-and-Machine-Learning/Training Exercises/Capstone and Data Viz Projects/Capstone Project/Salaries.csv"
)
head(x = sal)
## yearID teamID lgID playerID salary
## 1 1985 BAL AL murraed02 1472819
## 2 1985 BAL AL lynnfr01 1090000
## 3 1985 BAL AL ripkeca01 800000
## 4 1985 BAL AL lacyle01 725000
## 5 1985 BAL AL flanami01 641667
## 6 1985 BAL AL boddimi01 625000
# Keep only batting rows whose yearID is 1985 or later, then summarise every
# column of the reduced table.
batting <- batting %>%
  filter(yearID >= 1985)
summary(object = batting)
## playerID yearID stint teamID
## Length:35652 Min. :1985 Min. :1.00 Length:35652
## Class :character 1st Qu.:1993 1st Qu.:1.00 Class :character
## Mode :character Median :2000 Median :1.00 Mode :character
## Mean :2000 Mean :1.08
## 3rd Qu.:2007 3rd Qu.:1.00
## Max. :2013 Max. :4.00
##
## lgID G G_batting AB
## Length:35652 Min. : 1.0 Min. : 0.00 Min. : 0.0
## Class :character 1st Qu.: 14.0 1st Qu.: 4.00 1st Qu.: 3.0
## Mode :character Median : 34.0 Median : 27.00 Median : 47.0
## Mean : 51.7 Mean : 46.28 Mean :144.7
## 3rd Qu.: 77.0 3rd Qu.: 77.00 3rd Qu.:241.0
## Max. :163.0 Max. :163.00 Max. :716.0
## NA's :1406 NA's :4377
## R H X2B X3B
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 4.00 Median : 8.00 Median : 1.000 Median : 0.000
## Mean : 19.44 Mean : 37.95 Mean : 7.293 Mean : 0.824
## 3rd Qu.: 30.00 3rd Qu.: 61.00 3rd Qu.:11.000 3rd Qu.: 1.000
## Max. :152.00 Max. :262.00 Max. :59.000 Max. :23.000
## NA's :4377 NA's :4377 NA's :4377 NA's :4377
## HR RBI SB CS
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 3.00 Median : 0.000 Median : 0.000
## Mean : 4.169 Mean : 18.41 Mean : 2.811 Mean : 1.219
## 3rd Qu.: 5.000 3rd Qu.: 27.00 3rd Qu.: 2.000 3rd Qu.: 1.000
## Max. :73.000 Max. :165.00 Max. :110.000 Max. :29.000
## NA's :4377 NA's :4377 NA's :4377 NA's :4377
## BB SO IBB HBP
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 1.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 3.00 Median : 12.00 Median : 0.000 Median : 0.000
## Mean : 14.06 Mean : 27.03 Mean : 1.171 Mean : 1.273
## 3rd Qu.: 21.00 3rd Qu.: 42.00 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :232.00 Max. :223.00 Max. :120.000 Max. :35.000
## NA's :4377 NA's :4377 NA's :4378 NA's :4387
## SH SF GIDP G_old
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 11.0
## Median : 0.000 Median : 0.000 Median : 1.00 Median : 32.0
## Mean : 1.465 Mean : 1.212 Mean : 3.25 Mean : 49.7
## 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 5.00 3rd Qu.: 77.0
## Max. :39.000 Max. :17.000 Max. :35.00 Max. :163.0
## NA's :4377 NA's :4378 NA's :4377 NA's :5189
## BA OBP X1B SLG
## Min. :0.000 Min. :0.000 Min. : 0.00 Min. :0.000
## 1st Qu.:0.136 1st Qu.:0.188 1st Qu.: 0.00 1st Qu.:0.167
## Median :0.233 Median :0.296 Median : 6.00 Median :0.333
## Mean :0.205 Mean :0.262 Mean : 25.66 Mean :0.304
## 3rd Qu.:0.274 3rd Qu.:0.342 3rd Qu.: 42.00 3rd Qu.:0.423
## Max. :1.000 Max. :1.000 Max. :225.00 Max. :4.000
## NA's :8905 NA's :8821 NA's :4377 NA's :8905
# Join batting rows to salary rows on yearID + playerID. merge() keeps only
# keys present in both tables; non-key columns that exist in both (teamID,
# lgID) come back with .x (batting) and .y (salary) suffixes.
# NOTE(review): the key does not include the team, so every batting row of a
# player-year is paired with every salary row of that player-year.
combo <- merge(
  x = batting,
  y = sal,
  by = c("yearID", "playerID")
)
summary(object = combo)
## yearID playerID stint teamID.x
## Min. :1985 Length:25397 Min. :1.000 Length:25397
## 1st Qu.:1993 Class :character 1st Qu.:1.000 Class :character
## Median :1999 Mode :character Median :1.000 Mode :character
## Mean :1999 Mean :1.098
## 3rd Qu.:2006 3rd Qu.:1.000
## Max. :2013 Max. :4.000
##
## lgID.x G G_batting AB
## Length:25397 Min. : 1.00 Min. : 0.00 Min. : 0.0
## Class :character 1st Qu.: 26.00 1st Qu.: 8.00 1st Qu.: 5.0
## Mode :character Median : 50.00 Median : 42.00 Median : 85.0
## Mean : 64.06 Mean : 57.58 Mean :182.4
## 3rd Qu.:101.00 3rd Qu.:101.00 3rd Qu.:336.0
## Max. :163.00 Max. :163.00 Max. :716.0
## NA's :906 NA's :2661
## R H X2B X3B
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 1.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 9.00 Median : 19.00 Median : 3.000 Median : 0.000
## Mean : 24.71 Mean : 48.18 Mean : 9.276 Mean : 1.033
## 3rd Qu.: 43.00 3rd Qu.: 87.25 3rd Qu.:16.000 3rd Qu.: 1.000
## Max. :152.00 Max. :262.00 Max. :59.000 Max. :23.000
## NA's :2661 NA's :2661 NA's :2661 NA's :2661
## HR RBI SB CS
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 1.000 Median : 8.00 Median : 0.000 Median : 0.00
## Mean : 5.369 Mean : 23.56 Mean : 3.568 Mean : 1.54
## 3rd Qu.: 7.000 3rd Qu.: 39.00 3rd Qu.: 3.000 3rd Qu.: 2.00
## Max. :73.000 Max. :165.00 Max. :110.000 Max. :29.00
## NA's :2661 NA's :2661 NA's :2661 NA's :2661
## BB SO IBB HBP
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 2.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 6.00 Median : 20.00 Median : 0.000 Median : 0.000
## Mean : 17.98 Mean : 33.52 Mean : 1.533 Mean : 1.614
## 3rd Qu.: 29.00 3rd Qu.: 55.00 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :232.00 Max. :223.00 Max. :120.000 Max. :35.000
## NA's :2661 NA's :2661 NA's :2662 NA's :2670
## SH SF GIDP G_old
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 20.00
## Median : 0.000 Median : 0.000 Median : 2.000 Median : 47.00
## Mean : 1.786 Mean : 1.554 Mean : 4.127 Mean : 61.43
## 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 7.000 3rd Qu.:101.00
## Max. :39.000 Max. :17.000 Max. :35.000 Max. :163.00
## NA's :2661 NA's :2662 NA's :2661 NA's :3414
## BA OBP X1B SLG
## Min. :0.000 Min. :0.000 Min. : 0.0 Min. :0.000
## 1st Qu.:0.160 1st Qu.:0.208 1st Qu.: 0.0 1st Qu.:0.200
## Median :0.242 Median :0.305 Median : 13.0 Median :0.351
## Mean :0.212 Mean :0.270 Mean : 32.5 Mean :0.317
## 3rd Qu.:0.276 3rd Qu.:0.346 3rd Qu.: 59.0 3rd Qu.:0.432
## Max. :1.000 Max. :1.000 Max. :225.0 Max. :4.000
## NA's :5618 NA's :5562 NA's :2661 NA's :5618
## teamID.y lgID.y salary
## Length:25397 Length:25397 Min. : 0
## Class :character Class :character 1st Qu.: 255000
## Mode :character Mode :character Median : 550000
## Mean : 1879256
## 3rd Qu.: 2150000
## Max. :33000000
##
\[\\[0.01in]\]
# Pull the 2001 rows of the three departing players out of the merged table.
lost_players <- combo %>%
  filter(
    yearID == 2001,
    playerID %in% c("giambja01", "damonjo01", "saenzol01")
  )
lost_players
## yearID playerID stint teamID.x lgID.x G G_batting AB R H X2B X3B HR
## 1 2001 damonjo01 1 OAK AL 155 155 644 108 165 34 4 9
## 2 2001 giambja01 1 OAK AL 154 154 520 109 178 47 2 38
## 3 2001 saenzol01 1 OAK AL 106 106 305 33 67 21 1 9
## RBI SB CS BB SO IBB HBP SH SF GIDP G_old BA OBP X1B SLG
## 1 49 27 12 61 70 1 5 5 4 7 155 0.2562112 0.3235294 118 0.3633540
## 2 120 2 0 129 83 24 13 0 9 17 154 0.3423077 0.4769001 91 0.6596154
## 3 32 0 1 19 64 1 13 1 3 9 106 0.2196721 0.2911765 36 0.3836066
## teamID.y lgID.y salary
## 1 OAK AL 7100000
## 2 OAK AL 4103333
## 3 OAK AL 290000
\[\\[0.01in]\]
# lost_players still carries every merged column, but we only need
# playerID, H, X2B, X3B, HR, OBP, SLG, BA and AB, so let's trim the table
# down to those columns.
lost_players <- select(
  lost_players,
  c("playerID", "H", "X2B", "X3B", "HR", "OBP", "SLG", "BA", "AB")
)
lost_players
## playerID H X2B X3B HR OBP SLG BA AB
## 1 damonjo01 165 34 4 9 0.3235294 0.3633540 0.2562112 644
## 2 giambja01 178 47 2 38 0.4769001 0.6596154 0.3423077 520
## 3 saenzol01 67 21 1 9 0.2911765 0.3836066 0.2196721 305
\[\\[0.01in]\]
\[\\[0.01in]\]
# Total AB across the lost players.
lp_AB <- sum(lost_players[["AB"]])
lp_AB
## [1] 1469
# Mean AB per lost player: the total divided by the number of rows.
lp_meanAB <- lp_AB / nrow(lost_players)
lp_meanAB
## [1] 489.6667
\[\\[0.01in]\]
# Mean OBP of the lost players; used below as the lower bound for candidates.
lp_meanOBP <- mean(x = lost_players[["OBP"]])
lp_meanOBP
## [1] 0.3638687
\[\\[0.01in]\]
# Candidate pool: 2001 rows whose OBP and AB both exceed the lost players'
# means. Rows where a comparison is NA are dropped by filter().
potential <- combo %>%
  filter(
    yearID == 2001,
    OBP > lp_meanOBP,
    AB > lp_meanAB
  )
head(x = potential)
## yearID playerID stint teamID.x lgID.x G G_batting AB R H X2B X3B HR
## 1 2001 abreubo01 1 PHI NL 162 162 588 118 170 48 4 31
## 2 2001 alomaro01 1 CLE AL 157 157 575 113 193 34 12 20
## 3 2001 aloumo01 1 HOU NL 136 136 513 79 170 31 1 27
## 4 2001 aurilri01 1 SFN NL 156 156 636 114 206 37 5 37
## 5 2001 bagweje01 1 HOU NL 161 161 600 126 173 43 4 39
## 6 2001 berkmla01 1 HOU NL 156 156 577 110 191 55 5 34
## RBI SB CS BB SO IBB HBP SH SF GIDP G_old BA OBP X1B SLG
## 1 110 36 14 106 137 11 1 0 9 13 162 0.2891156 0.3934659 87 0.5425170
## 2 100 30 6 80 71 5 4 9 9 9 157 0.3356522 0.4146707 127 0.5408696
## 3 108 5 1 57 57 14 3 0 8 18 136 0.3313840 0.3958692 111 0.5536062
## 4 97 1 3 47 83 2 0 3 3 14 156 0.3238994 0.3688047 127 0.5723270
## 5 130 11 3 106 135 5 6 0 5 20 161 0.2883333 0.3974895 87 0.5683333
## 6 126 7 9 92 121 5 13 0 6 8 156 0.3310225 0.4302326 97 0.6204506
## teamID.y lgID.y salary
## 1 PHI NL 4983000
## 2 CLE AL 7750000
## 3 HOU NL 5250000
## 4 SFN NL 3250000
## 5 HOU NL 6500000
## 6 HOU NL 305000
\[\\[0.01in]\]
\[\\[0.01in]\]
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive scatter of the candidates: AB on x, OBP on y, points coloured
# by salary on a blue (low) to red (high) gradient.
pl1 <- ggplot(data = potential, mapping = aes(x = AB, y = OBP))
pl2 <- pl1 + geom_point(mapping = aes(colour = salary))
pl3 <- pl2 + scale_color_gradient(low = "blue", high = "red")
ggplotly(pl3)
\[\\[0.01in]\]
# Narrow the candidates with three hard-coded upper bounds: OBP below 0.445,
# AB below 650 and salary below 5,000,000.
potential <- potential %>%
  filter(OBP < 0.445 & AB < 650 & salary < 5000000)
# Redraw the same AB-vs-OBP scatter (coloured by salary) for the reduced pool.
pl1 <- ggplot(data = potential, mapping = aes(x = AB, y = OBP))
pl2 <- pl1 + geom_point(mapping = aes(colour = salary))
pl3 <- pl2 + scale_color_gradient(low = "blue", high = "red")
ggplotly(pl3)
\[\\[0.01in]\]
# Rank the remaining candidates by AB (highest first) and keep only the
# columns needed for the comparison.
potential <- potential %>%
  arrange(desc(AB)) %>%
  select(playerID, AB, OBP, salary)
# Take the top three candidates. head() returns at most the rows that exist,
# whereas potential[1:3, ] would pad the result with all-NA rows when fewer
# than three candidates survive the filters.
replacement <- head(potential, 3)
\[\\[0.01in]\]
## playerID AB OBP salary
## 1 stewash01 640 0.3710602 2183333
## 2 aurilri01 636 0.3688047 3250000
## 3 boonebr01 623 0.3722628 3250000
# Reduce both tables to the same three columns, then stack them: lost players
# first, replacements after.
lost_players <- select(lost_players, playerID, AB, OBP)
replacement <- select(replacement, playerID, AB, OBP)
compare <- rbind(lost_players, replacement)
#' Flag which player IDs belong to the replacement set.
#'
#' The previous implementation looped over `players` but returned during the
#' first iteration, so only the first element was ever tested and an empty
#' input produced NULL. This version tests every element at once.
#'
#' @param players Vector of player IDs to test.
#' @param replacement_ids Vector of IDs that count as new players. Defaults to
#'   `replacement$playerID`, resolved when the function is called, so existing
#'   one-argument calls behave as before.
#' @return Logical vector the same length as `players`: `TRUE` where the ID is
#'   in `replacement_ids`, `FALSE` otherwise (never `NA`).
new_player <- function(players, replacement_ids = replacement$playerID) {
  players %in% replacement_ids
}
# Mark each row of `compare` as a replacement (TRUE) or a lost player (FALSE).
# vapply() pins the result to exactly one logical per ID, so an empty or
# unexpected input cannot silently change the column's type as sapply() can.
compare$new_player <- vapply(
  compare$playerID,
  new_player,
  logical(1),
  USE.NAMES = FALSE
)
# Scatter of AB vs OBP coloured by the new_player flag.
pl1 <- ggplot(compare) + aes(x = AB, y = OBP)
pl2 <- pl1 + geom_point(aes(color = new_player))
# NOTE(review): faceting on the raw AB values and on OBP rounded to two
# decimals yields roughly one panel per player; confirm this layout is
# intended.
pl3 <- pl2 + facet_grid(AB ~ round(OBP, 2))
ggplotly(pl3)